In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import plotly.graph_objects as go
In [2]:
data = pd.read_csv(r'/home/apenwell/DataFun/torchlight_practice_data.csv') #change this line to ur file path
print(data)
       url product_facet   metadata_facet contenttype_facet page_title
0    http1     openshift     installation     documentation    Title 1
1    http1     openshift     installation     documentation    Title 1
2    http2          rhel        migration          solution    Title 2
3    http2          rhel        migration          solution    Title 2
4    http2          rhel        migration          solution    Title 2
5    http3     sateliite      development     documentation    Title 3
6    http4         cloud   administration          security    Title 4
7    http5     openstack        migration           article    Title 5
8    http6         jboss  troubleshooting             video    Title 6
9    http7       ansible  troubleshooting     documentation    Title 7
10   http7       ansible  troubleshooting     documentation    Title 7
11   http7       ansible  troubleshooting     documentation    Title 7
12   http7       ansible  troubleshooting     documentation    Title 7
13   http8     data grid   administration          solution    Title 8
14   http9       gluster     installation          security    Title 9
15  http10          ceph  troubleshooting          solution   Title 10
In [3]:
#finding all unique values in the metadata_facet column and outputting a count of each value

metadata_counts = data['metadata_facet'].value_counts()
print(metadata_counts)
cnum = np.arange(len(metadata_counts))
troubleshooting    6
migration          4
installation       3
administration     2
development        1
Name: metadata_facet, dtype: int64
In [4]:
#putting the above output into a bar graph

plt.figure()
plt.barh(cnum,metadata_counts, color = ('darkred','firebrick', 'red', 'tomato', 'lightsalmon'))
#ticks_loc = plt.xticks(np.arange(0, 1, step=0.2))  # Set label locations.
ticks = plt.yticks(np.arange(5), ['troubleshooting', 'migration', 'installation', 'administration',
                                 'development'])  # Set text labels and properties
plt.ylabel('Metadata Facet')
plt.xlabel('Number of resources')
plt.title('Resources by Metadata Facet')
#plt.savefig('metadata_bar_chart.png', transparent = False)
Out[4]:
Text(0.5, 1.0, 'Resources by Metadata Facet')
In [5]:
#putting the same data into a pie chart

plt.figure()
mylabels = ['troubleshooting', 'migration', 'installation', 'administration','development']
r = plt.pie(metadata_counts, labels = mylabels, autopct='%.0f%%')
plt.title('Resources by Metadata Facet')
Out[5]:
Text(0.5, 1.0, 'Resources by Metadata Facet')
In [3]:
#working on gauges of percentages

fig = go.Figure(go.Indicator(
    domain = {'x': [0, 1], 'y': [0, 1]},
    value = 80,
    mode = "gauge+number+delta",
    number = {'suffix': "%"},
    title = {'text': "Docs Impressions vs Docs Clicked"},
    delta = {'reference': 60}, #the number here could represent the percentages from the previous year
    gauge = {'axis': {'range': [None, 100]},
             'bar': {'color': "red"},
             'steps' : [
                 {'range': [0, 100], 'color': "lightgray"}],
             'threshold' : {'line': {'color': "green", 'width': 4}, 'thickness': 0.75, 'value': 90}}))

fig.show()

#the gauge would go from 0-100 to represent a percentage
#the red progress bar would show the percentage from the current month
#the increase/decrease below the percentage number is calculating the difference between the reference value,
    #delta and the current value, value
#the green line would mark the "goal" for that given percentage (ex below: here we want a 90% CTR)
In [4]:
#working on gauges of percentages - option 2

fig2 = go.Figure(go.Indicator(
    domain = {'x': [0, 1], 'y': [0, 1]},
    value = 80,
    mode = "gauge+number+delta",
    number = {'suffix': "%"},
    title = {'text': "Docs Impressions vs Docs Clicked"},
    delta = {'reference': 60}, #the number here could represent the percentages from the previous year
    gauge = {'axis': {'range': [None, 100]},
             'bar': {'color': "black"},
             'steps' : [
                 {'range': [0, 50], 'color': "red"},
                 {'range': [50, 85], 'color': "yellow"},
                 {'range': [85, 100], 'color': "green"}]}))

fig2.show()
In [8]:
#grouping data tests

product_counts = data['product_facet'].value_counts()

grouped = data.groupby('product_facet')

for key, item in grouped:
    print(grouped.get_group(key), "\n\n")
      url product_facet   metadata_facet contenttype_facet page_title
9   http7       ansible  troubleshooting     documentation    Title 7
10  http7       ansible  troubleshooting     documentation    Title 7
11  http7       ansible  troubleshooting     documentation    Title 7
12  http7       ansible  troubleshooting     documentation    Title 7 


       url product_facet   metadata_facet contenttype_facet page_title
15  http10          ceph  troubleshooting          solution   Title 10 


     url product_facet  metadata_facet contenttype_facet page_title
6  http4         cloud  administration          security    Title 4 


      url product_facet  metadata_facet contenttype_facet page_title
13  http8     data grid  administration          solution    Title 8 


      url product_facet metadata_facet contenttype_facet page_title
14  http9       gluster   installation          security    Title 9 


     url product_facet   metadata_facet contenttype_facet page_title
8  http6         jboss  troubleshooting             video    Title 6 


     url product_facet metadata_facet contenttype_facet page_title
0  http1     openshift   installation     documentation    Title 1
1  http1     openshift   installation     documentation    Title 1 


     url product_facet metadata_facet contenttype_facet page_title
7  http5     openstack      migration           article    Title 5 


     url product_facet metadata_facet contenttype_facet page_title
2  http2          rhel      migration          solution    Title 2
3  http2          rhel      migration          solution    Title 2
4  http2          rhel      migration          solution    Title 2 


     url product_facet metadata_facet contenttype_facet page_title
5  http3     sateliite    development     documentation    Title 3 


In [ ]:
#connecting the database to python. this would go above all the visualization code.

cursor = connection.cursor() #create a cursor so that you can interact with the database

from getpass import getpass
from mysql.connector import connect, Error

try:
    with connect(
        host="localhost",
        user=input("Enter username: "),
        password=getpass("Enter password: "),
        database="torchlight",
    ) as connection:
        print(connection)
except Error as e:
    print(e)
In [ ]:
#selecting all rows for a certain product (rhel in this case). this would go above all the visualization code

select_rhel = """
 SELECT *
 FROM WebBehavior
 WHERE product_facet = 'rhel'
 ORDER BY findability DESC
 """
with connection.cursor() as cursor:
     cursor.execute(select_rhel)
     for row in cursor.fetchall():
         print(row)
In [ ]:
#pie chart for contenttypes example - not actually going to use in this exact way

contenttype = mycursor.execute("Select contenttype_facet from WebBehavior")
result = mycursor.fetchall

contenttype = []
#Marks = []
  
for i in mycursor:
    contenttype.append(i[0])
    #Marks.append(i[1])

#plt.bar(Names, Marks)
#plt.ylim(0, 5)
#plt.xlabel("Name of Students")
#plt.ylabel("Marks of Students")
#plt.title("Student's Information")
#plt.show()

plt.pie(contenttype)
mylabels = ['troubleshooting', 'migration', 'installation', 'administration','development'] #fix this line
r = plt.pie(metadata_counts, labels = mylabels, autopct='%.0f%%') #fix this line
plt.title('Resources by Content Type')
plt.show()
In [ ]:
#page 1 - Home Page - usable code (hopefully)
#Main Table
select count(*) as "# of Docs" from WebBehavior;
select count(*) as "# of Docs with 200 Status" from WebBehavior where Status = '200';

#Main Visual and Content Type Visual (these are going to be the same...right?)
contenttype_counts = mycursor.execute("select ContentTypeFacet, count(*) from WebBehavior Group By ContentTypeFacet") #gets counts by content type
mylabels = mycursor.execute("select distinct ContentTypeFacet from WebBehavior") #don't know if this will work lol
result = mycursor.fetchall

contenttype_counts = []
#Marks = []
  
for i in mycursor:
    contenttype_counts.append(i[0])

plt.pie()
r = plt.pie(contenttype_counts, labels = mylabels, autopct='%.0f%%') #fix this line
plt.title('All Resources by Content Type')
plt.show()

#Content Type Table (im "ranking" these by total clicks not total queries because I think that makes more sense... is that ok?)
select ContentTypeFacet, sum(TotalClicks) from WebBehavior Group By ContentTypeFacet Order By sum(TotalClicks) DESC;


#Products Visual
product_counts = mycursor.execute("select ProductFacet, count(*) from WebBehavior Group By ProductFacet")
mylabels2 = mycursor.execute("select distinct ProductFacet from WebBehavior") #don't know if this will work lol
result = mycursor.fetchall

product_counts = []

for i in mycursor:
    product_counts.append(i[0])

plt.pie()
r = plt.pie(product_counts, labels = mylabels2, autopct='%.0f%%') #fix this line
plt.title('All Resources by Product')
plt.show()

#Product Table
select ProductFacet, sum(TotalClicks) from WebBehavior Group By ProductFacet Order By sum(TotalClicks) DESC;

#Metadata Visual
metadata_counts = mycursor.execute("select MetadataFacet, count(*) from WebBehavior Group By MetadataFacet")
mylabels3 = mycursor.execute("select distinct MetadataFacet from WebBehavior") #don't know if this will work lol
result = mycursor.fetchall

metadata_counts = []

for i in mycursor:
    metadata_counts.append(i[0])

plt.pie()
r = plt.pie(metadata_counts, labels = mylabels3, autopct='%.0f%%') #fix this line
plt.title('All Resources by Metadata')
plt.show()

#Metadata Table
select MetadataFacet, sum(TotalClicks) from WebBehavior Group By MetadataFacet Order By sum(TotalClicks) DESC;
In [ ]:
#page 2 - Product Page
#Main Table - rhel
#selecting number of docs from a certain product
select count(*) as "# of Docs" from WebBehavior where ProductFacet = 'rhel';
#selecting number of docs with 200 code
select count(*) as "# of Docs with 200 Status" from WebBehavior where ProductFacet = 'rhel' and Status = '200';
#selecting number of docs with 404 code
select count(*) as "# of Docs with 404 Status" from WebBehavior where ProductFacet = 'rhel' and Status = '404';
#Content CTR - rhel
select avg(Total_CTR) from WebBehavior where ProductFacet = 'rhel';
#Content from site search - rhel
#Content from search engines - rhel
#Queries % of search - rhel
#Findability score - rhel
select avg(FindabilityScore) from WebBehavior where ProductFacet = 'rhel';


#Content Type Visual - rhel
rhel_counts = mycursor.execute("select ContentTypeFacet, count(*) from WebBehavior where ProductFacet = 'rhel' Group By ContentTypeFacet")
rhel_labels = mycursor.execute("select distinct ContentTypeFacet from WebBehavior where ProductFacet = 'rhel'") #don't know if this will work lol
result = mycursor.fetchall

rhel_counts = []

for i in mycursor:
    rhel_counts.append(i[0])

plt.pie()
r = plt.pie(rhel_counts, labels = rhel_labels, autopct='%.0f%%') #fix this line
plt.title('RHEL Resources by Content Type')
plt.show()

#Metadata Visual - rhel
rhel_m_counts = mycursor.execute("select MetadataFacet, count(*) from WebBehavior where ProductFacet = 'rhel' Group By MetadataFacet")
rhel_m_labels = mycursor.execute("select distinct MetadataFacet from WebBehavior where ProductFacet = 'rhel'") #don't know if this will work lol
result = mycursor.fetchall

rhel_m_counts = []

for i in mycursor:
    rhel_m_counts.append(i[0])

plt.pie()
r = plt.pie(rhel_m_counts, labels = rhel_m_labels, autopct='%.0f%%') #fix this line
plt.title('RHEL Resources by Metadata Type')
plt.show()

#Top Docs - rhel
select WebURL, FindabilityScore from WebBehavior where ProductFacet = 'rhel' order by FindabilityScore DESC
LIMIT 5;
#Top Queries - rhel
select WebURL, Keyword, TotalQueries from WebBehavior where ProductFacet = 'rhel' order by TotalQueries DESC
LIMIT 5;
#Lowest Docs - rhel
select WebURL, FindabilityScore from WebBehavior where ProductFacet = 'rhel' order by FindabilityScore
LIMIT 5;